home *** CD-ROM | disk | FTP | other *** search
/ PC World Komputer 2010 April / PCWorld0410.iso / hity wydania / Ubuntu 9.10 PL / karmelkowy-koliberek-9.10-netbook-remix-PL.iso / casper / filesystem.squashfs / usr / lib / pymodules / python2.6 / BeautifulSoupTests.pyc (.txt) < prev    next >
Python Compiled Bytecode  |  2009-11-02  |  38KB  |  782 lines

  1. # Source Generated with Decompyle++
  2. # File: in.pyc (Python 2.6)
  3.  
  4. '''Unit tests for Beautiful Soup.
  5.  
  6. These tests make sure the Beautiful Soup works as it should. If you
  7. find a bug in Beautiful Soup, the best way to express it is as a test
  8. case like this that fails.'''
  9. import unittest
  10. from BeautifulSoup import *
  11.  
  12. class SoupTest(unittest.TestCase):
  13.     
  14.     def assertSoupEquals(self, toParse, rep = None, c = BeautifulSoup, encoding = None):
  15.         '''Parse the given text and make sure its string rep is the other
  16.         given text.'''
  17.         if rep == None:
  18.             rep = toParse
  19.         
  20.         obj = c(toParse)
  21.         if encoding is None:
  22.             rep2 = obj.decode()
  23.         else:
  24.             rep2 = obj.encode(encoding)
  25.         self.assertEqual(rep2, rep)
  26.  
  27.  
  28.  
  29. class FollowThatTag(SoupTest):
  30.     '''Tests the various ways of fetching tags from a soup.'''
  31.     
  32.     def setUp(self):
  33.         ml = '\n        <a id="x">1</a>\n        <A id="a">2</a>\n        <b id="b">3</a>\n        <b href="foo" id="x">4</a>\n        <ac width=100>4</ac>'
  34.         self.soup = BeautifulStoneSoup(ml)
  35.  
  36.     
  37.     def testFindAllByName(self):
  38.         matching = self.soup('a')
  39.         self.assertEqual(len(matching), 2)
  40.         self.assertEqual(matching[0].name, 'a')
  41.         self.assertEqual(matching, self.soup.findAll('a'))
  42.         self.assertEqual(matching, self.soup.findAll(SoupStrainer('a')))
  43.  
  44.     
  45.     def testFindAllByAttribute(self):
  46.         matching = self.soup.findAll(id = 'x')
  47.         self.assertEqual(len(matching), 2)
  48.         self.assertEqual(matching[0].name, 'a')
  49.         self.assertEqual(matching[1].name, 'b')
  50.         matching2 = self.soup.findAll(attrs = {
  51.             'id': 'x' })
  52.         self.assertEqual(matching, matching2)
  53.         strainer = SoupStrainer(attrs = {
  54.             'id': 'x' })
  55.         self.assertEqual(matching, self.soup.findAll(strainer))
  56.         self.assertEqual(len(self.soup.findAll(id = None)), 1)
  57.         self.assertEqual(len(self.soup.findAll(width = 100)), 1)
  58.         self.assertEqual(len(self.soup.findAll(junk = None)), 5)
  59.         self.assertEqual(len(self.soup.findAll(junk = [
  60.             1,
  61.             None])), 5)
  62.         self.assertEqual(len(self.soup.findAll(junk = re.compile('.*'))), 0)
  63.         self.assertEqual(len(self.soup.findAll(junk = True)), 0)
  64.         self.assertEqual(len(self.soup.findAll(junk = True)), 0)
  65.         self.assertEqual(len(self.soup.findAll(href = True)), 1)
  66.  
  67.     
  68.     def testFindallByClass(self):
  69.         soup = BeautifulSoup('<a>Foo</a><a class="1">Bar</a>')
  70.         self.assertEqual(soup.find('a', '1').string, 'Bar')
  71.  
  72.     
  73.     def testFindAllByList(self):
  74.         matching = self.soup([
  75.             'a',
  76.             'ac'])
  77.         self.assertEqual(len(matching), 3)
  78.  
  79.     
  80.     def testFindAllByHash(self):
  81.         matching = self.soup({
  82.             'a': True,
  83.             'b': True })
  84.         self.assertEqual(len(matching), 4)
  85.  
  86.     
  87.     def testFindAllText(self):
  88.         soup = BeautifulSoup('<html>\xbb</html>')
  89.         self.assertEqual(soup.findAll(text = re.compile('.*')), [
  90.             u'\xc2\xbb'])
  91.  
  92.     
  93.     def testFindAllByRE(self):
  94.         import re
  95.         r = re.compile('a.*')
  96.         self.assertEqual(len(self.soup(r)), 3)
  97.  
  98.     
  99.     def testFindAllByMethod(self):
  100.         
  101.         def matchTagWhereIDMatchesName(tag):
  102.             return tag.name == tag.get('id')
  103.  
  104.         matching = self.soup.findAll(matchTagWhereIDMatchesName)
  105.         self.assertEqual(len(matching), 2)
  106.         self.assertEqual(matching[0].name, 'a')
  107.  
  108.     
  109.     def testParents(self):
  110.         soup = BeautifulSoup('<ul id="foo"></ul><ul id="foo"><ul><ul id="foo" a="b"><b>Blah')
  111.         b = soup.b
  112.         self.assertEquals(len(b.findParents('ul', {
  113.             'id': 'foo' })), 2)
  114.         self.assertEquals(b.findParent('ul')['a'], 'b')
  115.  
  116.     PROXIMITY_TEST = BeautifulSoup('<b id="1"><b id="2"><b id="3"><b id="4">')
  117.     
  118.     def testNext(self):
  119.         soup = self.PROXIMITY_TEST
  120.         b = soup.find('b', {
  121.             'id': 2 })
  122.         self.assertEquals(b.findNext('b')['id'], '3')
  123.         self.assertEquals(b.findNext('b')['id'], '3')
  124.         self.assertEquals(len(b.findAllNext('b')), 2)
  125.         self.assertEquals(len(b.findAllNext('b', {
  126.             'id': 4 })), 1)
  127.  
  128.     
  129.     def testPrevious(self):
  130.         soup = self.PROXIMITY_TEST
  131.         b = soup.find('b', {
  132.             'id': 3 })
  133.         self.assertEquals(b.findPrevious('b')['id'], '2')
  134.         self.assertEquals(b.findPrevious('b')['id'], '2')
  135.         self.assertEquals(len(b.findAllPrevious('b')), 2)
  136.         self.assertEquals(len(b.findAllPrevious('b', {
  137.             'id': 2 })), 1)
  138.  
  139.     SIBLING_TEST = BeautifulSoup('<blockquote id="1"><blockquote id="1.1"></blockquote></blockquote><blockquote id="2"><blockquote id="2.1"></blockquote></blockquote><blockquote id="3"><blockquote id="3.1"></blockquote></blockquote><blockquote id="4">')
  140.     
  141.     def testNextSibling(self):
  142.         soup = self.SIBLING_TEST
  143.         tag = 'blockquote'
  144.         b = soup.find(tag, {
  145.             'id': 2 })
  146.         self.assertEquals(b.findNext(tag)['id'], '2.1')
  147.         self.assertEquals(b.findNextSibling(tag)['id'], '3')
  148.         self.assertEquals(b.findNextSibling(tag)['id'], '3')
  149.         self.assertEquals(len(b.findNextSiblings(tag)), 2)
  150.         self.assertEquals(len(b.findNextSiblings(tag, {
  151.             'id': 4 })), 1)
  152.  
  153.     
  154.     def testPreviousSibling(self):
  155.         soup = self.SIBLING_TEST
  156.         tag = 'blockquote'
  157.         b = soup.find(tag, {
  158.             'id': 3 })
  159.         self.assertEquals(b.findPrevious(tag)['id'], '2.1')
  160.         self.assertEquals(b.findPreviousSibling(tag)['id'], '2')
  161.         self.assertEquals(b.findPreviousSibling(tag)['id'], '2')
  162.         self.assertEquals(len(b.findPreviousSiblings(tag)), 2)
  163.         self.assertEquals(len(b.findPreviousSiblings(tag, id = 1)), 1)
  164.  
  165.     
  166.     def testTextNavigation(self):
  167.         soup = BeautifulSoup('Foo<b>Bar</b><i id="1"><b>Baz<br />Blee<hr id="1"/></b></i>Blargh')
  168.         baz = soup.find(text = 'Baz')
  169.         self.assertEquals(baz.findParent('i')['id'], '1')
  170.         self.assertEquals(baz.findNext(text = 'Blee'), 'Blee')
  171.         self.assertEquals(baz.findNextSibling(text = 'Blee'), 'Blee')
  172.         self.assertEquals(baz.findNextSibling(text = 'Blargh'), None)
  173.         self.assertEquals(baz.findNextSibling('hr')['id'], '1')
  174.  
  175.  
  176.  
  177. class SiblingRivalry(SoupTest):
  178.     '''Tests the nextSibling and previousSibling navigation.'''
  179.     
  180.     def testSiblings(self):
  181.         soup = BeautifulSoup('<ul><li>1<p>A</p>B<li>2<li>3</ul>')
  182.         secondLI = soup.find('li').nextSibling
  183.         if secondLI.name == 'li':
  184.             pass
  185.         self.assert_(secondLI.string == '2')
  186.         self.assertEquals(soup.find(text = '1').nextSibling.name, 'p')
  187.         self.assertEquals(soup.find('p').nextSibling, 'B')
  188.         self.assertEquals(soup.find('p').nextSibling.previousSibling.nextSibling, 'B')
  189.  
  190.  
  191.  
  192. class TagsAreObjectsToo(SoupTest):
  193.     '''Tests the various built-in functions of Tag objects.'''
  194.     
  195.     def testLen(self):
  196.         soup = BeautifulSoup('<top>1<b>2</b>3</top>')
  197.         self.assertEquals(len(soup.top), 3)
  198.  
  199.  
  200.  
  201. class StringEmUp(SoupTest):
  202.     """Tests the use of 'string' as an alias for a tag's only content."""
  203.     
  204.     def testString(self):
  205.         s = BeautifulSoup('<b>foo</b>')
  206.         self.assertEquals(s.b.string, 'foo')
  207.  
  208.     
  209.     def testLackOfString(self):
  210.         s = BeautifulSoup('<b>f<i>e</i>o</b>')
  211.         self.assert_(not (s.b.string))
  212.  
  213.  
  214.  
  215. class ThatsMyLimit(SoupTest):
  216.     '''Tests the limit argument.'''
  217.     
  218.     def testBasicLimits(self):
  219.         s = BeautifulSoup('<br id="1" /><br id="1" /><br id="1" /><br id="1" />')
  220.         self.assertEquals(len(s.findAll('br')), 4)
  221.         self.assertEquals(len(s.findAll('br', limit = 2)), 2)
  222.         self.assertEquals(len(s('br', limit = 2)), 2)
  223.  
  224.  
  225.  
  226. class OnlyTheLonely(SoupTest):
  227.     '''Tests the parseOnly argument to the constructor.'''
  228.     
  229.     def setUp(self):
  230.         x = []
  231.         for i in range(1, 6):
  232.             x.append('<a id="%s">' % i)
  233.             for j in range(100, 103):
  234.                 x.append('<b id="%s.%s">Content %s.%s</b>' % (i, j, i, j))
  235.             
  236.             x.append('</a>')
  237.         
  238.         self.x = ''.join(x)
  239.  
  240.     
  241.     def testOnly(self):
  242.         strainer = SoupStrainer('b')
  243.         soup = BeautifulSoup(self.x, parseOnlyThese = strainer)
  244.         self.assertEquals(len(soup), 15)
  245.         strainer = SoupStrainer(id = re.compile('100.*'))
  246.         soup = BeautifulSoup(self.x, parseOnlyThese = strainer)
  247.         self.assertEquals(len(soup), 5)
  248.         strainer = SoupStrainer(text = re.compile('10[01].*'))
  249.         soup = BeautifulSoup(self.x, parseOnlyThese = strainer)
  250.         self.assertEquals(len(soup), 10)
  251.         strainer = SoupStrainer(text = (lambda x: x[8] == '3'))
  252.         soup = BeautifulSoup(self.x, parseOnlyThese = strainer)
  253.         self.assertEquals(len(soup), 3)
  254.  
  255.  
  256.  
  257. class PickleMeThis(SoupTest):
  258.     '''Testing features like pickle and deepcopy.'''
  259.     
  260.     def setUp(self):
  261.         self.page = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"\n"http://www.w3.org/TR/REC-html40/transitional.dtd">\n<html>\n<head>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8">\n<title>Beautiful Soup: We called him Tortoise because he taught us.</title>\n<link rev="made" href="mailto:leonardr@segfault.org">\n<meta name="Description" content="Beautiful Soup: an HTML parser optimized for screen-scraping.">\n<meta name="generator" content="Markov Approximation 1.4 (module: leonardr)">\n<meta name="author" content="Leonard Richardson">\n</head>\n<body>\n<a href="foo">foo</a>\n<a href="foo"><b>bar</b></a>\n</body>\n</html>'
  262.         self.soup = BeautifulSoup(self.page)
  263.  
  264.     
  265.     def testPickle(self):
  266.         import pickle as pickle
  267.         dumped = pickle.dumps(self.soup, 2)
  268.         loaded = pickle.loads(dumped)
  269.         self.assertEqual(loaded.__class__, BeautifulSoup)
  270.         self.assertEqual(loaded.decode(), self.soup.decode())
  271.  
  272.     
  273.     def testDeepcopy(self):
  274.         deepcopy = deepcopy
  275.         import copy
  276.         deepcopy(BeautifulSoup('<a></a>'))
  277.         copied = deepcopy(self.soup)
  278.         self.assertEqual(copied.decode(), self.soup.decode())
  279.  
  280.     
  281.     def testUnicodePickle(self):
  282.         import cPickle as pickle
  283.         html = '<b>' + chr(195) + '</b>'
  284.         soup = BeautifulSoup(html)
  285.         dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL)
  286.         loaded = pickle.loads(dumped)
  287.         self.assertEqual(loaded.decode(), soup.decode())
  288.  
  289.  
  290.  
  291. class WriteOnlyCode(SoupTest):
  292.     '''Testing the modification of the tree.'''
  293.     
  294.     def testModifyAttributes(self):
  295.         soup = BeautifulSoup('<a id="1"></a>')
  296.         soup.a['id'] = 2
  297.         self.assertEqual(soup.decode(), '<a id="2"></a>')
  298.         del soup.a['id']
  299.         self.assertEqual(soup.decode(), '<a></a>')
  300.         soup.a['id2'] = 'foo'
  301.         self.assertEqual(soup.decode(), '<a id2="foo"></a>')
  302.  
  303.     
  304.     def testNewTagCreation(self):
  305.         """Makes sure tags don't step on each others' toes."""
  306.         soup = BeautifulSoup()
  307.         a = Tag(soup, 'a')
  308.         ol = Tag(soup, 'ol')
  309.         a['href'] = 'http://foo.com/'
  310.         self.assertRaises((KeyError,), (lambda : ol['href']))
  311.  
  312.     
  313.     def testTagReplacement(self):
  314.         text = '<a><b></b><c>Foo<d></d></c></a><a><e></e></a>'
  315.         soup = BeautifulSoup(text)
  316.         c = soup.c
  317.         soup.c.replaceWith(c)
  318.         self.assertEquals(soup.decode(), text)
  319.         soup = BeautifulSoup('<b>Argh!</b>')
  320.         soup.find(text = 'Argh!').replaceWith('Hooray!')
  321.         newText = soup.find(text = 'Hooray!')
  322.         b = soup.b
  323.         self.assertEqual(newText.previous, b)
  324.         self.assertEqual(newText.parent, b)
  325.         self.assertEqual(newText.previous.next, newText)
  326.         self.assertEqual(newText.next, None)
  327.         soup = BeautifulSoup('<a><b>Argh!</b><c></c><d></d></a>')
  328.         soup.b.insert(1, 'Hooray!')
  329.         newText = soup.find(text = 'Hooray!')
  330.         self.assertEqual(newText.previous, 'Argh!')
  331.         self.assertEqual(newText.previous.next, newText)
  332.         self.assertEqual(newText.previousSibling, 'Argh!')
  333.         self.assertEqual(newText.previousSibling.nextSibling, newText)
  334.         self.assertEqual(newText.nextSibling, None)
  335.         self.assertEqual(newText.next, soup.c)
  336.         text = "<html>There's <b>no</b> business like <b>show</b> business</html>"
  337.         soup = BeautifulSoup(text)
  338.         (no, show) = soup.findAll('b')
  339.         show.replaceWith(no)
  340.         self.assertEquals(soup.decode(), "<html>There's  business like <b>no</b> business</html>")
  341.         soup = BeautifulSoup('<a><b>Find</b><c>lady!</c><d></d></a>')
  342.         tag = Tag(soup, 'magictag')
  343.         tag.insert(0, 'the')
  344.         soup.a.insert(1, tag)
  345.         b = soup.b
  346.         c = soup.c
  347.         theText = tag.find(text = True)
  348.         findText = b.find(text = 'Find')
  349.         self.assertEqual(findText.next, tag)
  350.         self.assertEqual(tag.previous, findText)
  351.         self.assertEqual(b.nextSibling, tag)
  352.         self.assertEqual(tag.previousSibling, b)
  353.         self.assertEqual(tag.nextSibling, c)
  354.         self.assertEqual(c.previousSibling, tag)
  355.         self.assertEqual(theText.next, c)
  356.         self.assertEqual(c.previous, theText)
  357.         soup = BeautifulSoup('<a>We<b>reserve<c>the</c><d>right</d></b></a><e>to<f>refuse</f><g>service</g></e>')
  358.         f = soup.f
  359.         a = soup.a
  360.         c = soup.c
  361.         e = soup.e
  362.         weText = a.find(text = 'We')
  363.         soup.b.replaceWith(soup.f)
  364.         self.assertEqual(soup.decode(), '<a>We<f>refuse</f></a><e>to<g>service</g></e>')
  365.         self.assertEqual(f.previous, weText)
  366.         self.assertEqual(weText.next, f)
  367.         self.assertEqual(f.previousSibling, weText)
  368.         self.assertEqual(f.nextSibling, None)
  369.         self.assertEqual(weText.nextSibling, f)
  370.  
  371.     
  372.     def testAppend(self):
  373.         doc = "<p>Don't leave me <b>here</b>.</p> <p>Don't leave me.</p>"
  374.         soup = BeautifulSoup(doc)
  375.         second_para = soup('p')[1]
  376.         bold = soup.find('b')
  377.         soup('p')[1].append(soup.find('b'))
  378.         self.assertEqual(bold.parent, second_para)
  379.         self.assertEqual(soup.decode(), "<p>Don't leave me .</p> <p>Don't leave me.<b>here</b></p>")
  380.  
  381.     
  382.     def testTagExtraction(self):
  383.         text = '<html><div id="nav">Nav crap</div>Real content here.</html>'
  384.         soup = BeautifulSoup(text)
  385.         extracted = soup.find('div', id = 'nav').extract()
  386.         self.assertEqual(soup.decode(), '<html>Real content here.</html>')
  387.         self.assertEqual(extracted.decode(), '<div id="nav">Nav crap</div>')
  388.         text = '<doc><a>1<b>2</b></a><a>i<b>ii</b></a><a>A<b>B</b></a></doc>'
  389.         soup = BeautifulStoneSoup(text)
  390.         doc = soup.doc
  391.         (numbers, roman, letters) = soup('a')
  392.         self.assertEqual(roman.parent, doc)
  393.         oldPrevious = roman.previous
  394.         endOfThisTag = roman.nextSibling.previous
  395.         self.assertEqual(oldPrevious, '2')
  396.         self.assertEqual(roman.next, 'i')
  397.         self.assertEqual(endOfThisTag, 'ii')
  398.         self.assertEqual(roman.previousSibling, numbers)
  399.         self.assertEqual(roman.nextSibling, letters)
  400.         roman.extract()
  401.         self.assertEqual(roman.parent, None)
  402.         self.assertEqual(roman.previous, None)
  403.         self.assertEqual(roman.next, 'i')
  404.         self.assertEqual(letters.previous, '2')
  405.         self.assertEqual(roman.previousSibling, None)
  406.         self.assertEqual(roman.nextSibling, None)
  407.         self.assertEqual(endOfThisTag.next, None)
  408.         self.assertEqual(roman.b.contents[0].next, None)
  409.         self.assertEqual(numbers.nextSibling, letters)
  410.         self.assertEqual(letters.previousSibling, numbers)
  411.         self.assertEqual(len(doc.contents), 2)
  412.         self.assertEqual(doc.contents[0], numbers)
  413.         self.assertEqual(doc.contents[1], letters)
  414.         text = '<a>1<b>2<c>Hollywood, baby!</c></b></a>3'
  415.         soup = BeautifulStoneSoup(text)
  416.         one = soup.find(text = '1')
  417.         three = soup.find(text = '3')
  418.         toExtract = soup.b
  419.         soup.b.extract()
  420.         self.assertEqual(one.next, three)
  421.         self.assertEqual(three.previous, one)
  422.         self.assertEqual(one.parent.nextSibling, three)
  423.         self.assertEqual(three.previousSibling, soup.a)
  424.  
  425.  
  426.  
  427. class TheManWithoutAttributes(SoupTest):
  428.     '''Test attribute access'''
  429.     
  430.     def testHasKey(self):
  431.         text = "<foo attr='bar'>"
  432.         self.assertTrue(BeautifulSoup(text).foo.has_key('attr'))
  433.  
  434.  
  435.  
  436. class QuoteMeOnThat(SoupTest):
  437.     '''Test quoting'''
  438.     
  439.     def testQuotedAttributeValues(self):
  440.         self.assertSoupEquals("<foo attr='bar'></foo>", '<foo attr="bar"></foo>')
  441.         text = '<foo attr=\'bar "brawls" happen\'>a</foo>'
  442.         soup = BeautifulSoup(text)
  443.         self.assertEquals(soup.decode(), text)
  444.         soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
  445.         newText = '<foo attr=\'Brawls happen at "Bob&squot;s Bar"\'>a</foo>'
  446.         self.assertSoupEquals(soup.decode(), newText)
  447.         self.assertSoupEquals('<this is="really messed up & stuff">', '<this is="really messed up & stuff"></this>')
  448.  
  449.  
  450.  
  451. class YoureSoLiteral(SoupTest):
  452.     '''Test literal mode.'''
  453.     
  454.     def testLiteralMode(self):
  455.         text = '<script>if (i<imgs.length)</script><b>Foo</b>'
  456.         soup = BeautifulSoup(text)
  457.         self.assertEqual(soup.script.contents[0], 'if (i<imgs.length)')
  458.         self.assertEqual(soup.b.contents[0], 'Foo')
  459.  
  460.     
  461.     def testTextArea(self):
  462.         text = '<textarea><b>This is an example of an HTML tag</b><&<&</textarea>'
  463.         soup = BeautifulSoup(text)
  464.         self.assertEqual(soup.textarea.contents[0], '<b>This is an example of an HTML tag</b><&<&')
  465.  
  466.  
  467.  
  468. class OperatorOverload(SoupTest):
  469.     '''Our operators do it all! Call now!'''
  470.     
  471.     def testTagNameAsFind(self):
  472.         '''Tests that referencing a tag name as a member delegates to find().'''
  473.         soup = BeautifulSoup('<b id="1">foo<i>bar</i></b><b>Red herring</b>')
  474.         self.assertEqual(soup.b.i, soup.find('b').find('i'))
  475.         self.assertEqual(soup.b.i.string, 'bar')
  476.         self.assertEqual(soup.b['id'], '1')
  477.         self.assertEqual(soup.b.contents[0], 'foo')
  478.         self.assert_(not (soup.a))
  479.         self.assertEqual(soup.bTag.iTag.string, 'bar')
  480.         self.assertEqual(soup.b.iTag.string, 'bar')
  481.         self.assertEqual(soup.find('b').find('i'), soup.bTag.iTag)
  482.  
  483.  
  484.  
  485. class NestableEgg(SoupTest):
  486.     '''Here we test tag nesting. TEST THE NEST, DUDE! X-TREME!'''
  487.     
  488.     def testParaInsideBlockquote(self):
  489.         soup = BeautifulSoup('<blockquote><p><b>Foo</blockquote><p>Bar')
  490.         self.assertEqual(soup.blockquote.p.b.string, 'Foo')
  491.         self.assertEqual(soup.blockquote.b.string, 'Foo')
  492.         self.assertEqual(soup.find('p', recursive = False).string, 'Bar')
  493.  
  494.     
  495.     def testNestedTables(self):
  496.         text = '<table id="1"><tr><td>Here\'s another table:\n        <table id="2"><tr><td>Juicy text</td></tr></table></td></tr></table>'
  497.         soup = BeautifulSoup(text)
  498.         self.assertEquals(soup.table.table.td.string, 'Juicy text')
  499.         self.assertEquals(len(soup.findAll('table')), 2)
  500.         self.assertEquals(len(soup.table.findAll('table')), 1)
  501.         self.assertEquals(soup.find('table', {
  502.             'id': 2 }).parent.parent.parent.name, 'table')
  503.         text = '<table><tr><td><div><table>Foo</table></div></td></tr></table>'
  504.         soup = BeautifulSoup(text)
  505.         self.assertEquals(soup.table.tr.td.div.table.contents[0], 'Foo')
  506.         text = '<table><thead><tr>Foo</tr></thead><tbody><tr>Bar</tr></tbody>\n        <tfoot><tr>Baz</tr></tfoot></table>'
  507.         soup = BeautifulSoup(text)
  508.         self.assertEquals(soup.table.thead.tr.contents[0], 'Foo')
  509.  
  510.     
  511.     def testBadNestedTables(self):
  512.         soup = BeautifulSoup("<table><tr><table><tr id='nested'>")
  513.         self.assertEquals(soup.table.tr.table.tr['id'], 'nested')
  514.  
  515.  
  516.  
  517. class CleanupOnAisleFour(SoupTest):
  518.     '''Here we test cleanup of text that breaks HTMLParser or is just
  519.     obnoxious.'''
  520.     
  521.     def testSelfClosingtag(self):
  522.         self.assertEqual(BeautifulSoup('Foo<br/>Bar').find('br').decode(), '<br />')
  523.         self.assertSoupEquals('<p>test1<br/>test2</p>', '<p>test1<br />test2</p>')
  524.         text = '<p>test1<selfclosing>test2'
  525.         soup = BeautifulStoneSoup(text)
  526.         self.assertEqual(soup.decode(), '<p>test1<selfclosing>test2</selfclosing></p>')
  527.         soup = BeautifulStoneSoup(text, selfClosingTags = 'selfclosing')
  528.         self.assertEqual(soup.decode(), '<p>test1<selfclosing />test2</p>')
  529.  
  530.     
  531.     def testSelfClosingTagOrNot(self):
  532.         text = '<item><link>http://foo.com/</link></item>'
  533.         self.assertEqual(BeautifulStoneSoup(text).decode(), text)
  534.         self.assertEqual(BeautifulSoup(text).decode(), '<item><link />http://foo.com/</item>')
  535.  
  536.     
  537.     def testBooleanAttributes(self):
  538.         text = '<td nowrap>foo</td>'
  539.         self.assertSoupEquals(text, text)
  540.  
  541.     
  542.     def testCData(self):
  543.         xml = '<root>foo<![CDATA[foobar]]>bar</root>'
  544.         self.assertSoupEquals(xml, xml)
  545.         r = re.compile('foo.*bar')
  546.         soup = BeautifulSoup(xml)
  547.         self.assertEquals(soup.find(text = r).string, 'foobar')
  548.         self.assertEquals(soup.find(text = r).__class__, CData)
  549.  
  550.     
  551.     def testComments(self):
  552.         xml = 'foo<!--foobar-->baz'
  553.         self.assertSoupEquals(xml)
  554.         r = re.compile('foo.*bar')
  555.         soup = BeautifulSoup(xml)
  556.         self.assertEquals(soup.find(text = r).string, 'foobar')
  557.         self.assertEquals(soup.find(text = 'foobar').__class__, Comment)
  558.  
  559.     
  560.     def testDeclaration(self):
  561.         xml = 'foo<!DOCTYPE foobar>baz'
  562.         self.assertSoupEquals(xml)
  563.         r = re.compile('.*foo.*bar')
  564.         soup = BeautifulSoup(xml)
  565.         text = 'DOCTYPE foobar'
  566.         self.assertEquals(soup.find(text = r).string, text)
  567.         self.assertEquals(soup.find(text = text).__class__, Declaration)
  568.         namespaced_doctype = '<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd"><html>foo</html>'
  569.         soup = BeautifulSoup(namespaced_doctype)
  570.         self.assertEquals(soup.contents[0], 'DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd"')
  571.         self.assertEquals(soup.html.contents[0], 'foo')
  572.  
  573.     
  574.     def testEntityConversions(self):
  575.         text = '<<sacré bleu!>>'
  576.         soup = BeautifulStoneSoup(text)
  577.         self.assertSoupEquals(text)
  578.         xmlEnt = BeautifulStoneSoup.XML_ENTITIES
  579.         htmlEnt = BeautifulStoneSoup.HTML_ENTITIES
  580.         xhtmlEnt = BeautifulStoneSoup.XHTML_ENTITIES
  581.         soup = BeautifulStoneSoup(text, convertEntities = xmlEnt)
  582.         self.assertEquals(soup.decode(), '<<sacré bleu!>>')
  583.         soup = BeautifulStoneSoup(text, convertEntities = xmlEnt)
  584.         self.assertEquals(soup.decode(), '<<sacré bleu!>>')
  585.         soup = BeautifulStoneSoup(text, convertEntities = htmlEnt)
  586.         self.assertEquals(soup.decode(), u'<<sacr\xc3\xa9 bleu!>>')
  587.         text = '<™''
  588.         soup = BeautifulStoneSoup(text, convertEntities = xmlEnt)
  589.         self.assertEquals(soup.decode(), u"<™'")
  590.         soup = BeautifulStoneSoup(text, convertEntities = htmlEnt)
  591.         self.assertEquals(soup.decode(), u'<\xe2\x84\xa2'')
  592.         soup = BeautifulStoneSoup(text, convertEntities = xhtmlEnt)
  593.         self.assertEquals(soup.decode(), u"<\xe2\x84\xa2'")
  594.  
  595.     
  596.     def testNonBreakingSpaces(self):
  597.         soup = BeautifulSoup('<a>  </a>', convertEntities = BeautifulStoneSoup.HTML_ENTITIES)
  598.         self.assertEquals(soup.decode(), u'<a>\xc2\xa0\xc2\xa0</a>')
  599.  
  600.     
  601.     def testWhitespaceInDeclaration(self):
  602.         self.assertSoupEquals('<! DOCTYPE>', '<!DOCTYPE>')
  603.  
  604.     
  605.     def testJunkInDeclaration(self):
  606.         self.assertSoupEquals('<! Foo = -8>a', '<!Foo = -8>a')
  607.  
  608.     
  609.     def testIncompleteDeclaration(self):
  610.         self.assertSoupEquals('a<!b <p>c')
  611.  
  612.     
  613.     def testEntityReplacement(self):
  614.         self.assertSoupEquals('<b>hello there</b>')
  615.  
  616.     
  617.     def testEntitiesInAttributeValues(self):
  618.         self.assertSoupEquals('<x t="xñ">', '<x t="x\xc3\xb1"></x>', encoding = 'utf-8')
  619.         self.assertSoupEquals('<x t="xñ">', '<x t="x\xc3\xb1"></x>', encoding = 'utf-8')
  620.         soup = BeautifulSoup('<x t=">™">', convertEntities = BeautifulStoneSoup.HTML_ENTITIES)
  621.         self.assertEquals(soup.decode(), u'<x t=">\xe2\x84\xa2"></x>')
  622.         uri = 'http://crummy.com?sacré&bleu'
  623.         link = '<a href="%s"></a>' % uri
  624.         soup = BeautifulSoup(link, convertEntities = BeautifulSoup.HTML_ENTITIES)
  625.         self.assertEquals(soup.decode(), link.replace('é', u'\xc3\xa9'))
  626.         uri = 'http://crummy.com?sacré&bleu'
  627.         link = '<a href="%s"></a>' % uri
  628.         soup = BeautifulSoup(link, convertEntities = BeautifulSoup.HTML_ENTITIES)
  629.         self.assertEquals(soup.a['href'], uri.replace('é', u'\xc3\xa9'))
  630.  
  631.     
  632.     def testNakedAmpersands(self):
  633.         html = {
  634.             'convertEntities': BeautifulStoneSoup.HTML_ENTITIES }
  635.         soup = BeautifulStoneSoup('AT&T ', **html)
  636.         self.assertEquals(soup.decode(), 'AT&T ')
  637.         nakedAmpersandInASentence = 'AT&T was Ma Bell'
  638.         soup = BeautifulStoneSoup(nakedAmpersandInASentence, **html)
  639.         self.assertEquals(soup.decode(), nakedAmpersandInASentence.replace('&', '&'))
  640.         invalidURL = '<a href="http://example.org?a=1&b=2;3">foo</a>'
  641.         validURL = invalidURL.replace('&', '&')
  642.         soup = BeautifulStoneSoup(invalidURL)
  643.         self.assertEquals(soup.decode(), validURL)
  644.         soup = BeautifulStoneSoup(validURL)
  645.         self.assertEquals(soup.decode(), validURL)
  646.  
  647.  
  648.  
  649. class EncodeRed(SoupTest):
  650.     '''Tests encoding conversion, Unicode conversion, and Microsoft
  651.     smart quote fixes.'''
  652.     
  653.     def testUnicodeDammitStandalone(self):
  654.         markup = '<foo>\x92</foo>'
  655.         dammit = UnicodeDammit(markup)
  656.         self.assertEquals(dammit.unicode, '<foo>’</foo>')
  657.         hebrew = '\xed\xe5\xec\xf9'
  658.         dammit = UnicodeDammit(hebrew, [
  659.             'iso-8859-8'])
  660.         self.assertEquals(dammit.unicode, u'\xd7\x9d\xd7\x95\xd7\x9c\xd7\xa9')
  661.         self.assertEquals(dammit.originalEncoding, 'iso-8859-8')
  662.  
  663.     
  664.     def testGarbageInGarbageOut(self):
  665.         ascii = '<foo>a</foo>'
  666.         asciiSoup = BeautifulStoneSoup(ascii)
  667.         self.assertEquals(ascii, asciiSoup.decode())
  668.         unicodeData = u'<foo>\xc3\xbc</foo>'
  669.         utf8 = unicodeData.encode('utf-8')
  670.         self.assertEquals(utf8, '<foo>\xc3\xbc</foo>')
  671.         unicodeSoup = BeautifulStoneSoup(unicodeData)
  672.         self.assertEquals(unicodeData, unicodeSoup.decode())
  673.         self.assertEquals(unicodeSoup.foo.string, u'\xc3\xbc')
  674.         utf8Soup = BeautifulStoneSoup(utf8, fromEncoding = 'utf-8')
  675.         self.assertEquals(utf8, utf8Soup.encode('utf-8'))
  676.         self.assertEquals(utf8Soup.originalEncoding, 'utf-8')
  677.         utf8Soup = BeautifulStoneSoup(unicodeData)
  678.         self.assertEquals(utf8, utf8Soup.encode('utf-8'))
  679.         self.assertEquals(utf8Soup.originalEncoding, None)
  680.  
  681.     
  682.     def testHandleInvalidCodec(self):
  683.         for bad_encoding in [
  684.             '.utf8',
  685.             '...',
  686.             'utF---16.!']:
  687.             soup = BeautifulSoup(u'R\xc3\xa4ksm\xc3\xb6rg\xc3\xa5s'.encode('utf-8'), fromEncoding = bad_encoding)
  688.             self.assertEquals(soup.originalEncoding, 'utf-8')
  689.         
  690.  
  691.     
  692.     def testUnicodeSearch(self):
  693.         html = u'<html><body><h1>R\xc3\xa4ksm\xc3\xb6rg\xc3\xa5s</h1></body></html>'
  694.         soup = BeautifulSoup(html)
  695.         self.assertEqual(soup.find(text = u'R\xc3\xa4ksm\xc3\xb6rg\xc3\xa5s'), u'R\xc3\xa4ksm\xc3\xb6rg\xc3\xa5s')
  696.  
  697.     
  698.     def testRewrittenXMLHeader(self):
  699.         euc_jp = '<?xml version="1.0 encoding="euc-jp"?>\n<foo>\n\xa4\xb3\xa4\xec\xa4\xcfEUC-JP\xa4\xc7\xa5\xb3\xa1\xbc\xa5\xc7\xa5\xa3\xa5\xf3\xa5\xb0\xa4\xb5\xa4\xec\xa4\xbf\xc6\xfc\xcb\xdc\xb8\xec\xa4\xce\xa5\xd5\xa5\xa1\xa5\xa4\xa5\xeb\xa4\xc7\xa4\xb9\xa1\xa3\n</foo>\n'
  700.         utf8 = "<?xml version='1.0' encoding='utf-8'?>\n<foo>\n\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafEUC-JP\xe3\x81\xa7\xe3\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n</foo>\n"
  701.         soup = BeautifulStoneSoup(euc_jp)
  702.         if soup.originalEncoding != 'euc-jp':
  703.             raise Exception("Test failed when parsing euc-jp document. If you're running Python >=2.4, or you have cjkcodecs installed, this is a real problem. Otherwise, ignore it.")
  704.         soup.originalEncoding != 'euc-jp'
  705.         self.assertEquals(soup.originalEncoding, 'euc-jp')
  706.         self.assertEquals(soup.renderContents('utf-8'), utf8)
  707.         old_text = "<?xml encoding='windows-1252'><foo>\x92</foo>"
  708.         new_text = "<?xml version='1.0' encoding='utf-8'?><foo>’</foo>"
  709.         self.assertSoupEquals(old_text, new_text)
  710.  
  711.     
  712.     def testRewrittenMetaTag(self):
  713.         no_shift_jis_html = '<html><head>\n<meta http-equiv="Content-language" content="ja" /></head><body><pre>\n\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n</pre></body></html>'
  714.         soup = BeautifulSoup(no_shift_jis_html)
  715.         strainer = SoupStrainer('pre')
  716.         soup = BeautifulSoup(no_shift_jis_html, parseOnlyThese = strainer)
  717.         self.assertEquals(soup.contents[0].name, 'pre')
  718.         meta_tag = '<meta content="text/html; charset=x-sjis" http-equiv="Content-type" />'
  719.         shift_jis_html = '<html><head>\n%s\n<meta http-equiv="Content-language" content="ja" /></head><body><pre>\n\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n</pre></body></html>' % meta_tag
  720.         soup = BeautifulSoup(shift_jis_html)
  721.         if soup.originalEncoding != 'shift-jis':
  722.             raise Exception("Test failed when parsing shift-jis document with meta tag '%s'.If you're running Python >=2.4, or you have cjkcodecs installed, this is a real problem. Otherwise, ignore it." % meta_tag)
  723.         soup.originalEncoding != 'shift-jis'
  724.         self.assertEquals(soup.originalEncoding, 'shift-jis')
  725.         content_type_tag = soup.meta['content']
  726.         self.assertEquals(content_type_tag[content_type_tag.find('charset='):], 'charset=%SOUP-ENCODING%')
  727.         content_type = str(soup.meta)
  728.         index = content_type.find('charset=')
  729.         self.assertEqual(content_type[index:index + len('charset=utf8') + 1], 'charset=utf-8')
  730.         content_type = soup.meta.encode('shift-jis')
  731.         index = content_type.find('charset=')
  732.         self.assertEqual(content_type[index:index + len('charset=shift-jis')], 'charset=shift-jis'.encode())
  733.         self.assertEquals(soup.encode('utf-8'), '<html><head>\n<meta content="text/html; charset=utf-8" http-equiv="Content-type" />\n<meta http-equiv="Content-language" content="ja" /></head><body><pre>\n\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafShift-JIS\xe3\x81\xa7\xe3\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n</pre></body></html>')
  734.         self.assertEquals(soup.encode('shift-jis'), shift_jis_html.replace('x-sjis'.encode(), 'shift-jis'.encode()))
  735.         isolatin = '<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>'
  736.         soup = BeautifulSoup(isolatin)
  737.         utf8 = isolatin.replace('ISO-Latin-1'.encode(), 'utf-8'.encode())
  738.         utf8 = utf8.replace('\xe9', '\xc3\xa9')
  739.         self.assertSoupEquals(soup.encode('utf-8'), utf8, encoding = 'utf-8')
  740.  
  741.     
  742.     def testHebrew(self):
  743.         iso_8859_8 = '<HEAD>\n<TITLE>Hebrew (ISO 8859-8) in Visual Directionality</TITLE>\n\n\n\n</HEAD>\n<BODY>\n<H1>Hebrew (ISO 8859-8) in Visual Directionality</H1>\n\xed\xe5\xec\xf9\n</BODY>\n'
  744.         utf8 = '<head>\n<title>Hebrew (ISO 8859-8) in Visual Directionality</title>\n</head>\n<body>\n<h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\n\xd7\x9d\xd7\x95\xd7\x9c\xd7\xa9\n</body>\n'
  745.         soup = BeautifulStoneSoup(iso_8859_8, fromEncoding = 'iso-8859-8')
  746.         self.assertEquals(soup.encode('utf-8'), utf8)
  747.  
  748.     
  749.     def testSmartQuotesNotSoSmartAnymore(self):
  750.         self.assertSoupEquals('\x91Foo\x92 <!--blah-->', '‘Foo’ <!--blah-->')
  751.  
  752.     
  753.     def testDontConvertSmartQuotesWhenAlsoConvertingEntities(self):
  754.         smartQuotes = 'Il a dit, \x8bSacré bleu!\x9b'
  755.         soup = BeautifulSoup(smartQuotes)
  756.         self.assertEquals(soup.decode(), 'Il a dit, ‹Sacré bleu!›')
  757.         soup = BeautifulSoup(smartQuotes, convertEntities = 'html')
  758.         self.assertEquals(soup.encode('utf-8'), 'Il a dit, \xe2\x80\xb9Sacr\xc3\xa9 bleu!\xe2\x80\xba')
  759.  
  760.     
  761.     def testDontSeeSmartQuotesWhereThereAreNone(self):
  762.         utf_8 = '\xe3\x82\xb1\xe3\x83\xbc\xe3\x82\xbf\xe3\x82\xa4 Watch'
  763.         self.assertSoupEquals(utf_8, encoding = 'utf-8')
  764.  
  765.  
  766.  
  767. class Whitewash(SoupTest):
  768.     '''Test whitespace preservation.'''
  769.     
  770.     def testPreservedWhitespace(self):
  771.         self.assertSoupEquals('<pre>   </pre>')
  772.         self.assertSoupEquals('<pre> woo  </pre>')
  773.  
  774.     
  775.     def testCollapsedWhitespace(self):
  776.         self.assertSoupEquals('<p>   </p>', '<p> </p>')
  777.  
  778.  
  779. if __name__ == '__main__':
  780.     unittest.main()
  781.  
  782.